data_clean

Final Project: Alternate Runs Data-set

Experimental design: Task switches occurred every 4 trials in an alternating-runs manner (no cues). A total of 8 experimental blocks was intended, but 60 subjects saw only 7 blocks. Four conditions were counterbalanced across these blocks: both tasks unambiguous (1); both tasks ambiguous (4); shape ambiguous when irrelevant, color always ambiguous (2); shape always ambiguous, color unambiguous when irrelevant (3).

What the variables mean:

  • block: 0 for practice; there are a total of 8 experimental blocks per participant and each block consists of 112 trials, so 896 trials per participant over all 8 blocks (784 for participants who saw only 7 blocks), not counting practice

  • bal: counterbalancing of conditions across blocks

  • x, y, c2: irrelevant (already taken out)

  • cycle: counting within full alternating cycle (8), switch at 1 and 5

  • task: 1=shape, 2=color

  • dimshape: specific shape; 4 = neutral

  • dimcolor: specific color; 4 = neutral

  • correct: correct response (i.e., value of the currently relevant task dimension)

  • error: 0 = no error, 1 = yes error

  • response: actual response

  • RT: response time (ms)

Import data-set and some packages

#open necessary packages here
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(readr)
library(rio)
library(psych) #generate metrix w scatterplot and cor

Attaching package: 'psych'

The following objects are masked from 'package:ggplot2':

    %+%, alpha
# Read the raw alternating-runs data from the CSV in the working directory.
AlternateRuns <- readr::read_csv(file = "AlternateRuns.csv")
Rows: 94754 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): id, bal, block, x, cond, trial, y, c2, cycle, task, dim1, dim2, co...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the imported data with view().
# NOTE(review): presumably only useful in an interactive session — confirm
# whether this call should stay in the rendered document.
view(AlternateRuns)

Clean-Data

Rename Columns to Understand Better

# Give the terse raw column names descriptive labels:
# dim1 -> dimshape, dim2 -> dimcolor, time -> RT, cor -> correct, res -> response.
alt_run <- rename(
  AlternateRuns,
  dimshape = dim1,
  dimcolor = dim2,
  RT = time,
  correct = cor,
  response = res
)
alt_run
# A tibble: 94,754 × 16
      id   bal block     x  cond trial     y    c2 cycle  task dimshape dimcolor
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1     1     1     0     0     1     1     0     1     1     1        1        4
 2     1     1     0     0     1     2     0     1     2     1        2        4
 3     1     1     0     0     1     3     0     1     3     1        1        4
 4     1     1     0     0     1     4     0     1     4     1        2        4
 5     1     1     0     0     1     5     0     1     5     2        4        1
 6     1     1     0     0     1     6     0     1     6     2        4        2
 7     1     1     0     0     1     7     0     1     7     2        4        3
 8     1     1     0     0     1     8     0     1     8     2        4        2
 9     1     1     0     0     1     9     0     1     1     1        1        4
10     1     1     0     0     1    10     0     1     2     1        3        4
# ℹ 94,744 more rows
# ℹ 4 more variables: correct <dbl>, error <dbl>, response <dbl>, RT <dbl>

Replace Numeric Values w Character Strings for Task and Error

This is set to a new df in case we want to use the variables for task or error in a graph as a character.

# Labelled copy of the data: task and error become character columns
# (task: 1 -> "shape", 2 -> "color"; error: 0 -> "no", 1 -> "yes").
alt_run_1 <- mutate(
  alt_run,
  task = recode(task, `1` = "shape", `2` = "color"),
  error = recode(error, `0` = "no", `1` = "yes")
)
alt_run_1
# A tibble: 94,754 × 16
      id   bal block     x  cond trial     y    c2 cycle task  dimshape dimcolor
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>    <dbl>    <dbl>
 1     1     1     0     0     1     1     0     1     1 shape        1        4
 2     1     1     0     0     1     2     0     1     2 shape        2        4
 3     1     1     0     0     1     3     0     1     3 shape        1        4
 4     1     1     0     0     1     4     0     1     4 shape        2        4
 5     1     1     0     0     1     5     0     1     5 color        4        1
 6     1     1     0     0     1     6     0     1     6 color        4        2
 7     1     1     0     0     1     7     0     1     7 color        4        3
 8     1     1     0     0     1     8     0     1     8 color        4        2
 9     1     1     0     0     1     9     0     1     1 shape        1        4
10     1     1     0     0     1    10     0     1     2 shape        3        4
# ℹ 94,744 more rows
# ℹ 4 more variables: correct <dbl>, error <chr>, response <dbl>, RT <dbl>

Remove Practice Trials in Block Variable + pivot func

# Drop the practice block (block == 0) and show what is left.
alt_run <- filter(alt_run, block != 0)
print(alt_run)
# A tibble: 83,440 × 16
      id   bal block     x  cond trial     y    c2 cycle  task dimshape dimcolor
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1     1     1     1     0     1     1     0     1     1     1        3        4
 2     1     1     1     0     1     2     0     1     2     1        3        4
 3     1     1     1     0     1     3     0     1     3     1        1        4
 4     1     1     1     0     1     4     0     1     4     1        2        4
 5     1     1     1     0     1     5     0     1     5     2        4        1
 6     1     1     1     0     1     6     0     1     6     2        4        1
 7     1     1     1     0     1     7     0     1     7     2        4        3
 8     1     1     1     0     1     8     0     1     8     2        4        1
 9     1     1     1     0     1     9     0     1     1     1        1        4
10     1     1     1     0     1    10     0     1     2     1        3        4
# ℹ 83,430 more rows
# ℹ 4 more variables: correct <dbl>, error <dbl>, response <dbl>, RT <dbl>
# TODO: use pivot_longer() and/or pivot_wider() here with the key variables we want to look at. We may first need to recode the 1s and 0s in some columns into labels (e.g., correct/incorrect or color/shape). Fix the code below.

# alt_run %>% 
#   pivot_wider(names_from = task, values_from = block)

Determine and Remove Outliers (Error way…)

# We are testing for accuracy, so each participant needs at least 80% accuracy
# across all trials.
# Error criterion: the number of errors that corresponds to 20% of 896 trials.
# NOTE(review): 896 assumes every participant completed 8 blocks of 112 trials;
# participants who saw only 7 blocks have fewer trials — confirm whether a
# per-participant error rate should be used instead.
crit <- 896 - (896 * .8)
crit # maximum number of errors (1 in error col) allowed: more than 179 errors out of 896 trials means accuracy is below 80%
[1] 179.2
# Total number of error trials (error == 1) for each participant.
sum_er <- alt_run %>%
  group_by(id) %>%
  summarize(sum = sum(error))
print(sum_er)
# A tibble: 99 × 2
      id   sum
   <dbl> <dbl>
 1     1    39
 2     2    89
 3     3    34
 4     4    33
 5     5     4
 6     6    16
 7     7    26
 8     8    39
 9     9    25
10    10    21
# ℹ 89 more rows
# Flag participants whose error count exceeds the criterion `crit`.
sum_er <- mutate(sum_er, outlier_er = sum > crit)
print(sum_er)
# A tibble: 99 × 3
      id   sum outlier_er
   <dbl> <dbl> <lgl>     
 1     1    39 FALSE     
 2     2    89 FALSE     
 3     3    34 FALSE     
 4     4    33 FALSE     
 5     5     4 FALSE     
 6     6    16 FALSE     
 7     7    26 FALSE     
 8     8    39 FALSE     
 9     9    25 FALSE     
10    10    21 FALSE     
# ℹ 89 more rows
# Keep only the flagged participants; two people fall below 80% accuracy.
sum_er <- filter(sum_er, outlier_er)
print(sum_er)
# A tibble: 2 × 3
     id   sum outlier_er
  <dbl> <dbl> <lgl>     
1    70   181 TRUE      
2    87   184 TRUE      
# Remove the low-accuracy participants from the data.
# At this point `sum_er` holds only the rows with outlier_er == TRUE
# (ids 70 and 87 in this data set), so the ids to drop are read from it
# instead of being hard-coded.

alt_run <- alt_run %>%
  filter(!id %in% sum_er$id)

Determine and Remove inter-response Outliers

# Sort trials from slowest to fastest to eyeball the largest RTs.
arrange(alt_run, desc(RT))
# A tibble: 81,648 × 16
      id   bal block     x  cond trial     y    c2 cycle  task dimshape dimcolor
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1     3     1     4     0     4    78     0     4     6     2        3        2
 2    17     1     4     0     4     9     0     4     1     1        1        2
 3    26     2     1     0     4     1     0     4     1     1        3        3
 4    41     1     1     0     1    81     0     1     1     1        3        4
 5    41     1     4     0     4   101     0     4     5     2        2        2
 6    48     2     7     0     3    73     0     3     1     1        1        4
 7    54     2     1     0     4    11     0     4     3     1        1        3
 8    54     2     1     0     4    48     0     4     8     2        2        3
 9    54     2     6     0     2     1     0     2     1     1        2        1
10    56     2     3     0     2    17     0     2     1     1        1        3
# ℹ 81,638 more rows
# ℹ 4 more variables: correct <dbl>, error <dbl>, response <dbl>, RT <dbl>
#WHAT TO DO: z-score RT within each sequence position x switch x ambiguity cell, then z-score within each block (to account for some participants only doing 7 instead of 8 blocks)

# STEP 1: split cycle position into two columns.
# switchcyc keeps the cycle value on switch trials (positions 1 and 5) and is
# NA elsewhere; controlcyc keeps it on all other positions and is NA on
# switch trials.

alt_run <- alt_run %>%
  mutate(
    switchcyc = replace(cycle, !cycle %in% c(1, 5), NA_real_),
    controlcyc = replace(cycle, cycle %in% c(1, 5), NA_real_)
  )
alt_run
# A tibble: 81,648 × 18
      id   bal block     x  cond trial     y    c2 cycle  task dimshape dimcolor
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1     1     1     1     0     1     1     0     1     1     1        3        4
 2     1     1     1     0     1     2     0     1     2     1        3        4
 3     1     1     1     0     1     3     0     1     3     1        1        4
 4     1     1     1     0     1     4     0     1     4     1        2        4
 5     1     1     1     0     1     5     0     1     5     2        4        1
 6     1     1     1     0     1     6     0     1     6     2        4        1
 7     1     1     1     0     1     7     0     1     7     2        4        3
 8     1     1     1     0     1     8     0     1     8     2        4        1
 9     1     1     1     0     1     9     0     1     1     1        1        4
10     1     1     1     0     1    10     0     1     2     1        3        4
# ℹ 81,638 more rows
# ℹ 6 more variables: correct <dbl>, error <dbl>, response <dbl>, RT <dbl>,
#   switchcyc <dbl>, controlcyc <dbl>
#STEP 2: Isolate RTs for switch trials, look for outliers through z-score

#STEP 3: Isolate RTs for control trials, look for outliers through z-score

#STEP 4: Start steps for z-score on each block

Descriptive Graphs

1. Histogram of RT

# Grand mean of RT over all trials in alt_run, ignoring missing values.
mean_rt <- alt_run %>%
  pull(RT) %>%
  mean(na.rm = TRUE)
mean_rt
[1] 860.4441
# Standard deviation of RT; parameterises the reference normal curve below.
sd_rt <- sd(alt_run$RT, na.rm = TRUE)


# Density-scaled histogram of RT with the mean marked (red dashed line) and a
# normal curve with the same mean and SD overlaid (gold).
# `linewidth` replaces the `size` aesthetic for lines, which is deprecated
# since ggplot2 3.4.0. `bins = 30` spells out the value stat_bin() otherwise
# falls back to with a message, so the plot itself is unchanged.
alt_run %>%
  ggplot(aes(x = RT)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    fill = "darkgreen",
    color = "darkblue"
  ) +
  geom_vline(
    aes(xintercept = mean_rt),
    color = "red",
    linetype = "dashed",
    linewidth = 1.5
  ) +
  theme_minimal() +
  stat_function(
    fun = dnorm,
    args = list(mean = mean_rt, sd = sd_rt),
    col = "gold",
    linewidth = 1.5
  ) +
  labs(
    x = "Response Times (ms)",
    y = "Density",
    title = "Density plot of Response Times",
    subtitle = "The mean and normal density curve of RTs"
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Interactive version of the most recently drawn ggplot.
plotly::ggplotly(p = ggplot2::last_plot())
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2. Boxplot of RT

# Boxplots are drawn from `alt_run`, the cleaned data (practice block and
# low-accuracy participants removed), rather than from `alt_run_1`, which was
# created before any cleaning. In `alt_run` task is numeric:
# 1 = shape, 2 = color.

# boxplot of all RTs regardless of task
boxplot(alt_run$RT)

# boxplot of RTs when doing shape task
boxplot_s <- filter(alt_run, task == 1)
boxplot(boxplot_s$RT)

# boxplot of RTs when doing color task
boxplot_c <- filter(alt_run, task == 2)
boxplot(boxplot_c$RT)

3. Correlations

# Correlation matrix of RT, cycle position and task code, computed on rows
# that have no missing value in any of the three columns.
cor_alt <- select(alt_run, RT, cycle, task)
cor(cor_alt, use = "complete.obs")
               RT      cycle        task
RT     1.00000000 -0.1627983 -0.05256423
cycle -0.16279828  1.0000000  0.87287156
task  -0.05256423  0.8728716  1.00000000
# Is there a correlation between response times and error rate?
# pairs.panels() comes from the psych package.
pairs.panels(select(alt_run, RT, error), lm = TRUE)

5. Scatterplots in select

  • What is the relationship between error and response time?

  • What is the position of the cycle and the relationship from that to the response time?

  • What are the dynamics of switching tasks? (5th cycle or 1st)

  • Is there a difference in response time when people switch from one task to another?

# Peek at the first six rows of the cleaned data.
head(alt_run, n = 6)
# A tibble: 6 × 18
     id   bal block     x  cond trial     y    c2 cycle  task dimshape dimcolor
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1     1     1     1     0     1     1     0     1     1     1        3        4
2     1     1     1     0     1     2     0     1     2     1        3        4
3     1     1     1     0     1     3     0     1     3     1        1        4
4     1     1     1     0     1     4     0     1     4     1        2        4
5     1     1     1     0     1     5     0     1     5     2        4        1
6     1     1     1     0     1     6     0     1     6     2        4        1
# ℹ 6 more variables: correct <dbl>, error <dbl>, response <dbl>, RT <dbl>,
#   switchcyc <dbl>, controlcyc <dbl>
# Relationship between error and response time.
# error takes only two values (0 = no error, 1 = error), so the points fall
# on two horizontal bands along the RT axis.
ggplot(alt_run, aes(x = RT, y = error)) +
  geom_point()

# Position in the cycle and its relationship with response time.
# cycle is a discrete position, so the points line up in rows; it is not yet
# clear what this plot says about the data.
ggplot(alt_run, aes(x = RT, y = cycle)) +
  geom_point()

# Same relationship with the axes swapped: cycle position on x, RT on y.
ggplot(alt_run, aes(x = cycle, y = RT)) +
  geom_point()

narrow df for plotting

# Narrow the data to the columns used for plotting and show the first rows.
alt_select <- select(alt_run, task, trial, cycle, response, error, RT)
head(alt_select)
# A tibble: 6 × 6
   task trial cycle response error    RT
  <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>
1     1     1     1        3     0   547
2     1     2     2        3     0   476
3     1     3     3        1     0   620
4     1     4     4        2     0   620
5     2     5     5        3     1   780
6     2     6     6        1     0   484
# Response time plotted against task code (1 = shape, 2 = color).
ggplot(alt_select, aes(x = RT, y = task)) +
  geom_point()

6. Selecting : already did?

7. Pivoting

8. Descriptives table